Spatial Functions
Contents
Spatial Functions#
Data source : https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page
from sqlalchemy import create_engine
import geopandas as gpd
import requests
import shutil
import folium
import json
from pathlib import Path
engine_str = "postgresql+psycopg2://docker:docker@0.0.0.0:25432/restaurants"
engine = create_engine(engine_str)
%load_ext sql
%sql $engine.url
'Connected: docker@restaurants'
Download MapPluto#
def download_file(url: str, local_path: Path):
with requests.get(url, stream=True) as r:
with open(local_path, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_path
url = 'https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_mappluto_22v2_fgdb.zip'
local_filename = url.split('/')[-1]
downloads = Path('../data/tmp/')
local_path = downloads.joinpath(local_filename)
if not downloads.exists():
downloads.mkdir(parents=True)
# guard against unnecessary re-downloading
if not local_path.exists():
download_file(url, local_path)
gdb = 'MapPLUTO22v2.gdb'
layer = 'MapPLUTO_22v2_clipped'
We can actually access python variables in the bash kernel. Lets use it!
%%bash -s "$local_path" "$gdb"
echo $1 $2
../data/tmp/nyc_mappluto_22v2_fgdb.zip MapPLUTO22v2.gdb
%%bash -s "$local_path" "$gdb" "$layer"
gdb="../data/tmp/$2"
if [ ! -d $gdb ]; then
unzip -o $1 -d ../data/tmp/ > /dev/null
fi
# extract relevant info
echo $(gdalsrsinfo -e $gdb | grep "^EPSG:")
ogrinfo -so $gdb $3 | grep 'Geometry\|Feature Count\|Extent'
# import do db
ogr2ogr -f "PostgreSQL" \
PG:"host='0.0.0.0' port='25432' user='docker' password='docker' dbname='restaurants'" $gdb $3 \
-nlt PROMOTE_TO_MULTI \
-nln "mappluto" \
-lco GEOMETRY_NAME=geom \
-overwrite
EPSG:2263
Geometry: Multi Polygon
Feature Count: 857020
Extent: (913128.926363, 120048.985951) - (1067335.951282, 272811.183060)
Geometry Column = Shape
Process is interrupted.
%%sql
select column_name, data_type from information_schema.columns
where table_schema = 'public'
and table_name = 'mappluto'
LIMIT 5;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
| column_name | data_type |
|---|---|
| objectid | integer |
| borough | character varying |
| block | integer |
| lot | smallint |
| cd | smallint |
Queens is the only borough that matters. We will create a separate mappluto_queens table for this demo.
%sql SELECT DISTINCT borough FROM mappluto;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
| borough |
|---|
| QN |
| SI |
| BK |
| MN |
| BX |
%%sql
DROP TABLE IF EXISTS mappluto_nyc;
DROP TABLE IF EXISTS mappluto_queens;
CREATE TABLE mappluto_nyc as SELECT objectid, borough, zipcode, ownertype, yearbuilt, assesstot, geom FROM mappluto;
CREATE TABLE mappluto_queens as SELECT * FROM mappluto_nyc WHERE borough='QN';
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
Done.
857020 rows affected.
324248 rows affected.
[]
Spatial Data Quality Check#
Create indexes on newly created tables
%%sql
CREATE INDEX IF NOT EXISTS mappluto_nyc_geom_idx ON mappluto_nyc USING gist(geom);
CREATE INDEX IF NOT EXISTS mappluto_queens_geom_idx ON mappluto_queens USING gist(geom);
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
We will be using the new york counties table we created in the last exercise. Let’s examine the SRIDs we have.
%sql SELECT state_name, ST_SRID(geom) as geom_srid, ST_SRID(geom_utm18n) as geom_utm18n_srid FROM cb_2020_ny_county_500k LIMIT 1;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
| state_name | geom_srid | geom_utm18n_srid |
|---|---|---|
| New York | 4269 | 3725 |
%sql SELECT ST_SRID(geom) as geom_srid FROM mappluto_queens LIMIT 1;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
| geom_srid |
|---|
| 2263 |
Check if geometries simple/valid
%%sql
SELECT *
FROM mappluto_queens
WHERE ST_IsValid(geom) is not true
LIMIT 100;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
| objectid | borough | zipcode | ownertype | yearbuilt | assesstot | geom |
|---|
%%sql
SELECT *
FROM mappluto_queens
WHERE ST_IsSimple(geom) is not true
LIMIT 100;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
| objectid | borough | zipcode | ownertype | yearbuilt | assesstot | geom |
|---|
%%sql
SELECT *
FROM cb_2020_ny_county_500k
WHERE ST_IsValid(geom) is not true
LIMIT 100;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
| ogc_fid | statefp | countyfp | countyns | affgeoid | geoid | name | namelsad | stusps | state_name | lsad | aland | awater | geom | geom_utm18n |
|---|
%%sql
SELECT *
FROM cb_2020_ny_county_500k
WHERE ST_IsSimple(geom) is not true
LIMIT 100;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
| ogc_fid | statefp | countyfp | countyns | affgeoid | geoid | name | namelsad | stusps | state_name | lsad | aland | awater | geom | geom_utm18n |
|---|
Visual Verification of SRID assignments#
We export the tables to GeoJSON for creating leaflet web maps via folium
%%sql pluto_sample <<
SELECT ST_AsGeoJSON(subq) as geojson
FROM (
SELECT
objectid,
zipcode,
ownertype,
yearbuilt,
assesstot,
ST_Transform(geom, 4326) as geom
FROM mappluto_queens ORDER BY assesstot DESC LIMIT 1000
) as subq;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1000 rows affected.
Returning data to local variable pluto_sample
%%sql ny_county <<
SELECT ST_AsGeoJSON(subq.*) as geojson
FROM (
SELECT
ogc_fid,
statefp,
countyfp,
countyns,
affgeoid,
geoid,
name,
namelsad,
stusps,
state_name,
lsad ,
aland,
awater,
ST_Transform(geom, 4326) as geom
FROM cb_2020_ny_county_500k
) as subq;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable ny_county
QUEENS = [40.7292, -73.9049]
def to_map(result_set, location=QUEENS, zoom=13, fillcolor='orange'):
m = folium.Map(location=location, zoom_start=zoom)
for _, r in result_set.DataFrame().iterrows():
geojson = json.loads(r.geojson)
if 'properties' in geojson.keys():
properties = geojson['properties']
else:
properties = None
popup_cols = [c for c in properties.keys() if 'geom' not in c.lower()] if properties is not None else []
geo_j = folium.GeoJson(data=geojson,
style_function=lambda x: {'fillColor': fillcolor},
popup=folium.GeoJsonPopup(popup_cols)
)
geo_j.add_to(m)
return m
to_map(pluto_sample)
to_map(ny_county, zoom = 10, fillcolor='purple')
Non-Spatial Data Quality Check#
%sql SELECT yearbuilt FROM mappluto_queens WHERE yearbuilt < 1800 ORDER BY yearbuilt DESC LIMIT 10;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
7 rows affected.
| yearbuilt |
|---|
| 1785 |
| 1775 |
| 1735 |
| 1729 |
| 1706 |
| 1694 |
| 1661 |
yearbuilt=0 should be a null value.
%%sql
UPDATE mappluto SET yearbuilt=NULL WHERE yearbuilt=0;
UPDATE mappluto_nyc SET yearbuilt=NULL WHERE yearbuilt=0;
UPDATE mappluto_queens SET yearbuilt = NULL WHERE yearbuilt=0;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
40240 rows affected.
40240 rows affected.
0 rows affected.
[]
%sql SELECT * FROM mappluto_queens WHERE zipcode <= 0;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
0 rows affected.
| objectid | borough | zipcode | ownertype | yearbuilt | assesstot | geom |
|---|
%%sql
SELECT COUNT(*) FROM mappluto_queens WHERE assesstot <= 0;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1 rows affected.
| count |
|---|
| 1220 |
assesstot_low = %sql SELECT ST_AsGeoJSON(subq.*) as geojson FROM (SELECT ownertype, assesstot, ST_Transform(geom, 4326) as geom FROM mappluto_queens WHERE assesstot <=0) as subq;
to_map(assesstot_low, fillcolor='red')
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1220 rows affected.
It looks like many of the assesstot=0 examples are public property. I’m not sure these should be considered NULL, so we’ll leave them and account for it in our aggregations.
Spatial Calculations#
The mappluto geom column is in EPSG:2263, a projection in US feet.
%%sql
SELECT
objectid,
ROUND(ST_Area(geom)::numeric,2) as area_ft2,
ROUND(ST_Perimeter(geom)::numeric,2) as perimeter_ft,
DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
FROM mappluto_queens
LIMIT 10;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
| objectid | area_ft2 | perimeter_ft | age_years |
|---|---|---|---|
| 384318 | 2504.11 | 228.04 | 94 |
| 384319 | 1780.32 | 220.32 | 82 |
| 384320 | 2085.68 | 235.43 | 82 |
| 384321 | 2731.72 | 268.54 | 122 |
| 384322 | 19367.13 | 623.09 | 92 |
| 384323 | 1978.28 | 232.74 | 52 |
| 384324 | 2514.51 | 251.96 | 102 |
| 384325 | 2582.04 | 253.30 | 14 |
| 384326 | 2564.72 | 284.54 | 112 |
| 384327 | 2520.12 | 243.51 | 52 |
%%sql
SELECT
zipcode,
ROUND(AVG(ST_Area(geom))::numeric,2) as mean_area_ft2,
ROUND(AVG(assesstot)::numeric,2) as mean_assesstot_dollar
FROM mappluto_queens
-- edge case guards
WHERE assesstot > 0 AND zipcode is NOT NULL
GROUP BY zipcode
ORDER BY mean_area_ft2 DESC
LIMIT 10;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
| zipcode | mean_area_ft2 | mean_assesstot_dollar |
|---|---|---|
| 11371 | 11776044.15 | 1925443350.00 |
| 11359 | 3111281.36 | 33181770.00 |
| 11697 | 1637333.54 | 5959041.85 |
| 11695 | 1547725.00 | 31394250.00 |
| 11430 | 799130.72 | 11205355.00 |
| 11005 | 687929.59 | 24933921.43 |
| 11109 | 35344.61 | 37012320.50 |
| 11422 | 30900.67 | 1169964.75 |
| 11451 | 29977.68 | 411300.00 |
| 11693 | 23625.09 | 176013.70 |
%%sql
WITH subq AS (
SELECT
zipcode,
DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
FROM mappluto_queens
)
SELECT zipcode, mode() WITHIN GROUP (ORDER BY age_years) as age_mode
FROM subq
GROUP BY zipcode
-- ensure at least one non-null age
HAVING SUM(CASE WHEN age_years IS NULL then 0 else 1 end) >= 1
LIMIT 10;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
| zipcode | age_mode |
|---|---|
| 11001 | 67 |
| 11004 | 72 |
| 11005 | 50 |
| 11040 | 72 |
| 11101 | 91 |
| 11102 | 92 |
| 11103 | 97 |
| 11104 | 95 |
| 11105 | 92 |
| 11106 | 92 |
%%sql
WITH subq AS (
SELECT
zipcode,
ROUND(ST_Area(geom)::numeric, 2) as area_ft2,
DATE_PART('year', AGE(CURRENT_DATE, TO_DATE(yearbuilt::varchar(4), 'YYYY')))::int as age_years
FROM mappluto_queens
)
SELECT
zipcode,
percentile_disc(0.5) WITHIN GROUP (ORDER BY area_ft2) AS median_area_ft2,
percentile_disc(0.5) WITHIN GROUP (ORDER BY age_years) AS median_age
FROM subq
GROUP BY zipcode
-- ensure at least one non-null age
HAVING SUM(CASE WHEN age_years IS NULL then 0 else 1 end) >= 1
LIMIT 10;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
10 rows affected.
| zipcode | median_area_ft2 | median_age |
|---|---|---|
| 11001 | 3975.97 | 72 |
| 11004 | 4028.26 | 72 |
| 11005 | 100726.73 | 50 |
| 11040 | 4264.12 | 72 |
| 11101 | 3942.21 | 91 |
| 11102 | 2536.56 | 92 |
| 11103 | 2478.47 | 94 |
| 11104 | 2207.72 | 93 |
| 11105 | 2324.60 | 92 |
| 11106 | 2472.45 | 92 |
Simplification#
ST_Simplify returns a “simplified” geometry using Douglas-Peucker algorithm. Topology is not preserved.
tolerance = 0.001
%%sql simp001 <<
WITH subq AS (
SELECT
name,
ST_Simplify(ST_Transform(geom, 4326), 0.001) as geom
FROM cb_2020_ny_county_500k
)
SELECT ST_AsGeoJSON(subq.*) as geojson
FROM subq
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable simp001
to_map(simp001, zoom=12, fillcolor='green')
tolerance = 0.01
%%sql simp01 <<
WITH subq AS (
SELECT
name,
ST_Simplify(ST_Transform(geom, 4326), 0.01) as geom
FROM cb_2020_ny_county_500k
)
SELECT ST_AsGeoJSON(subq.*) as geojson
FROM subq
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
62 rows affected.
Returning data to local variable simp01
to_map(simp01, zoom=12, fillcolor='green')